library(Seurat)
library(Matrix)
library(ggplot2)
library(ggpubr)
library(MAST)
library(scCustomize)
library(tidyverse)
# Load counts
counts_matrix <- readRDS("~/immune_cells/scRNAseq_analysis/Expression_matrix/Nvec_sc_raw_counts.rds")
counts_matrix <- as.matrix(counts_matrix)
# Create Seurat object
seurat_object <- CreateSeuratObject(counts = counts_matrix)
## Warning: Feature names cannot have
## underscores ('_'), replacing with dashes
## ('-')
## Warning: Data is of class matrix. Coercing
## to dgCMatrix.
# Read metadata tables
cellid_to_mc <- read.delim(
"~/immune_cells/scRNAseq_analysis/Expression_matrix/CellID_to_MC_table.txt",
header = FALSE
)
exp_class <- read.delim(
"~/immune_cells/scRNAseq_analysis/Expression_matrix/Nvec_sc_experiment_classification.txt",
header = FALSE
)
colnames(cellid_to_mc) <- c("CellName", "CellID")
colnames(exp_class) <- c("CellName", "Condition")
# Merge metadata (by cell barcode)
meta <- merge(cellid_to_mc, exp_class, by = "CellName", all = FALSE)
# Keep only cells that exist in Seurat object
meta <- meta[meta$CellName %in% colnames(seurat_object), ]
# Put CellName as rownames so Seurat can align safely
rownames(meta) <- meta$CellName
meta$CellName <- NULL
# Make sure I am not missing anything unexpectedly
stopifnot(all(rownames(meta) %in% colnames(seurat_object)))
# Add once (Seurat will match by rownames)
seurat_object <- AddMetaData(seurat_object, metadata = meta)
stopifnot(!anyDuplicated(rownames(meta)))
stopifnot(!anyNA(meta$CellID), !anyNA(meta$Condition))
table(table(rownames(meta)))
##
## 1
## 11550
unique(seurat_object@meta.data$Condition)
## [1] "iHCl" "Ctrl" "tPIC"
#Replace with the correct conditions
seurat_object$Condition <- dplyr::recode(seurat_object$Condition,
"iHCl" = "NaCl",
"tPIC" = "pIC")
seurat <- seurat_object
# Pre-processing and QC ---------------------------------------------------
setwd("~/immune_cells/scRNAseq_analysis/seurat_pipeline/")
#saveRDS(seurat, "seurat_unprocessed.rds")
seurat <- readRDS("seurat_unprocessed.rds")
# Visualize QC metrics as a violin plot
VlnPlot(seurat,
features = c("nFeature_RNA", "nCount_RNA"),
ncol = 2)
## Warning: Default search for "data" layer in
## "RNA" assay yielded no results; utilizing
## "counts" layer instead.

# nCount_RNA vs nFeature_RNA: depth–complexity relationship per cell - looks good.
# nFeature_RNA violin: gene detection comparability across identities - looks good.
# Assess relationship between sequencing depth (UMI counts) and transcriptome complexity (detected genes) per cell.
# Check if there is any difference between the biological replicates
FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

# Overall, looks clean. Minimal filtering is needed.
#-----------------------------#
# QC filtering thresholds
#-----------------------------#
MIN_FEATURES <- 500
MAX_FEATURES <- 3000
MAX_COUNTS <- 10000
MIN_CELLS_PER_GENE <- 10
REMOVE_PATTERN <- "orphan" # Remove "orphan" genes which are irrelevant for this project.
#-----------------------------#
# Data before filtering
#-----------------------------#
qc_before <- seurat@meta.data[, c("nFeature_RNA", "nCount_RNA")]
qc_before$stage <- "Before"
#-----------------------------#
# Cell filtering
#-----------------------------#
seurat_filt <- subset(seurat,
subset = nFeature_RNA > MIN_FEATURES &
nFeature_RNA < MAX_FEATURES &
nCount_RNA < MAX_COUNTS)
#-----------------------------#
# Gene filtering
#-----------------------------#
counts_mat <- GetAssayData(seurat_filt, assay = "RNA", layer = "counts")
keep_by_detection <- Matrix::rowSums(counts_mat > 0) >= MIN_CELLS_PER_GENE
keep_by_name <- !grepl(REMOVE_PATTERN, rownames(seurat_filt), ignore.case = TRUE)
seurat_filt <- seurat_filt[keep_by_detection & keep_by_name, ]
stopifnot(!any(grepl(
REMOVE_PATTERN, rownames(seurat_filt), ignore.case = TRUE
)))
#-----------------------------#
# Data after filtering
#-----------------------------#
qc_after <- seurat_filt@meta.data[, c("nFeature_RNA", "nCount_RNA")]
qc_after$stage <- "After"
qc_all <- rbind(qc_before, qc_after)
# Quick summaries
cat("Cells before:", nrow(qc_before), "\n")
## Cells before: 11550
cat("Cells after :", nrow(qc_after), "\n")
## Cells after : 11065
cat("Genes before:", nrow(seurat), "\n")
## Genes before: 21329
cat("Genes after :", nrow(seurat_filt), "\n")
## Genes after : 16535
#-----------------------------#
# Plots with thresholds
#-----------------------------#
# Scatter: nCount vs nFeature (Before/After) + threshold lines
p_scatter <- ggplot(qc_all, aes(x = nCount_RNA, y = nFeature_RNA)) +
geom_point(size = 0.4, alpha = 0.35) +
facet_wrap( ~ stage) +
geom_vline(xintercept = MAX_COUNTS, linetype = "dashed") +
geom_hline(yintercept = MIN_FEATURES, linetype = "dashed") +
geom_hline(yintercept = MAX_FEATURES, linetype = "dashed") +
theme_classic() +
labs(title = "QC space before/after filtering", x = "nCount_RNA (UMIs per cell)", y = "nFeature_RNA (genes detected per cell)")
# Violin: nFeature_RNA + thresholds
p_vln_feat <- ggplot(qc_all, aes(x = stage, y = nFeature_RNA)) +
geom_violin() +
geom_jitter(width = 0.2,
size = 0.25,
alpha = 0.25) +
geom_hline(yintercept = MIN_FEATURES, linetype = "dashed") +
geom_hline(yintercept = MAX_FEATURES, linetype = "dashed") +
theme_classic() +
labs(title = "nFeature_RNA before/after", x = NULL, y = "nFeature_RNA")
# Violin: nCount_RNA + threshold
p_vln_count <- ggplot(qc_all, aes(x = stage, y = nCount_RNA)) +
geom_violin() +
geom_jitter(width = 0.2,
size = 0.25,
alpha = 0.25) +
geom_hline(yintercept = MAX_COUNTS, linetype = "dashed") +
theme_classic() +
labs(title = "nCount_RNA before/after", x = NULL, y = "nCount_RNA")
# Print plots
p_scatter

p_vln_feat

p_vln_count

# Save filtered object
#saveRDS(seurat_filt, "seurat_filt.RDS")
# Normalization and transformation ----------------------------------------
# I used the sctransform R package (https://github.com/satijalab/sctransform) to preform normalization and transformation
# I ran the following command in R on the cluster:
# library(sctransform)
# seurat <- SCTransform(
# seurat,
# verbose = FALSE
# )
# Then I saved the SCT object:
# saveRDS(seurat, "seurat_SCT.rds")
# Dimension reduction and clustering -------------------------------------
seurat_SCT <- readRDS("~/immune_cells/scRNAseq_analysis/seurat_pipeline/seurat_SCT.rds")
# These are now standard steps in the Seurat workflow for visualization and clustering
seurat <- RunPCA(seurat_SCT, verbose = FALSE)
# Inspect the elbow plot
ElbowPlot(seurat, ndims = 50)

# PC 1 to 30 explain most of the variance
seurat <- RunUMAP(seurat, dims = 1:30, verbose = FALSE)
seurat <- FindNeighbors(seurat, dims = 1:30, verbose = FALSE)
# Louvain clustering
# I intentionally used low resolution to avoid over-clustering and to focus on main cell types in early embryos
# This allowed the detection of neurons, gland cells, and even a tiny cnidocyte population
seurat <- FindClusters(seurat, verbose = FALSE, resolution = 0.2)
# Visualize
DimPlot(seurat, label = TRUE)

scCustomize::DimPlot_scCustom(
seurat,
label = T,
figure_plot = T,
colors_use = "Dark2",
shuffle = T
)
## Warning: Removed 1 row containing missing values or
## values outside the scale range
## (`geom_point()`).

# Save the clustered object
#saveRDS(seurat,
# "~/immune_cells/scRNAseq_analysis/seurat_pipeline/seurat_clust.RDS")
#QC
DefaultAssay(seurat) <- "RNA"
FeaturePlot(seurat, features = c("nCount_RNA", "nFeature_RNA"))

# It looks relatively uniform:
# The UMAP structure was not shaped by sequencing depth or detected gene counts.
# I will proceed without further filtering.
# Analysis and visualization ---------------------------------------------
# Read the clustered object
seurat <- readRDS("~/immune_cells/scRNAseq_analysis/seurat_pipeline/seurat_clust.RDS")
# Visualize replicates
# Extract replicate information from the cell names
seurat$Replicate <- ifelse(grepl("^Nvec01_", colnames(seurat)), "Replicate 1", "Replicate 2")
# Visualize UMAP, coloring cells by the 'Replicate' column
DimPlot(seurat, group.by = "Replicate")

# No batch effect - the two replicates are nearly identical.
# Visualize the distribution of cells by conditions
DimPlot(seurat, group.by = "Condition")

# poly(I:C) has a strong enrichment of cells in cluster 1
# Visualize metacells
DimPlot(seurat, group.by = "CellID")

# Difficult to see. Another method:
meta <- seurat@meta.data %>%
select(CellID, seurat_clusters)
# contingency table
mc_cluster_tab <- table(meta$CellID, meta$seurat_clusters)
mc_frac <- as.data.frame(mc_cluster_tab)
colnames(mc_frac) <- c("CellID", "Cluster", "n_cells")
mc_frac <- mc_frac %>%
group_by(CellID) %>%
mutate(total_cells = sum(n_cells),
frac = n_cells / total_cells) %>%
ungroup()
mc_cluster1 <- mc_frac %>%
filter(Cluster == "1") %>%
arrange(desc(frac))
head(mc_cluster1, 20)
## # A tibble: 20 × 5
## CellID Cluster n_cells total_cells frac
## <fct> <fct> <int> <int> <dbl>
## 1 141 1 62 62 1
## 2 146 1 71 71 1
## 3 155 1 50 50 1
## 4 156 1 67 67 1
## 5 158 1 67 67 1
## 6 159 1 52 52 1
## 7 160 1 56 56 1
## 8 161 1 77 77 1
## 9 163 1 45 45 1
## 10 165 1 33 33 1
## 11 166 1 40 40 1
## 12 167 1 75 75 1
## 13 168 1 63 63 1
## 14 169 1 56 56 1
## 15 170 1 55 55 1
## 16 174 1 62 62 1
## 17 175 1 44 44 1
## 18 176 1 68 68 1
## 19 177 1 79 79 1
## 20 178 1 93 93 1
# Mostly the metacells from 141 and above which fits Arnau's analysis.
# Visualize expression of genes of interest
FeaturePlot(seurat, features = "mCherry-plus-strand")

VlnPlot(seurat,
features = "mCherry-plus-strand",
pt.size = 0.2,
split.by = "Condition")

library(scCustomize)
# RlRb_1 "Nvec-vc1.1-XM-048731783.1"
FeaturePlot_scCustom(seurat, features = "Nvec-vc1.1-XM-048731783.1")

# RLRb_2 Nvec-vc1.1-XM-048731783.1
FeaturePlot_scCustom(seurat, features = "Nvec-vc1.1-XM-048731786.1")

# Visualize mCherry-plus-strand expression on UMAP grouped by 'Condition'
FeaturePlot_scCustom(seurat, features = "mCherry-plus-strand", split.by = "Condition")

# RLRb_1
FeaturePlot_scCustom(seurat, features = "Nvec-vc1.1-XM-048731783.1", split.by = "Condition")

# Check the correlation in the metacell expression matrix (uploaded to Zenodo).
f <- "~/working/scRNAseq Arnau/to_yehu/expression_matrices/Nvec_mc_umifrac.txt"
tab <- read.delim(header = T, f)
rows <- tab[c("mCherry_plus_strand", "Nvec_vc1.1_XM_048731783.1"), ]
df <- as.data.frame(t(rows))
# Create scatter plot with linear regression line and R-squared value
plot <- ggscatter(
df,
x = "Nvec_vc1.1_XM_048731783.1",
y = "mCherry_plus_strand",
add = "reg.line",
conf.int = TRUE,
cor.coef = TRUE,
cor.method = "pearson",
ggtheme = theme_pubr()
) +
stat_cor(aes(label = paste(..rr.label.., sep = "~`, `~")), label.x = 0.1, label.y = 2) +
labs(title = "", x = "RLRb UMI fraction", y = "mCherry UMI fraction") +
theme(text = element_text(size = 12))
# Save as PNG
# ggsave("scatter_plot_mc.png", plot, width = 7, height = 5, dpi = 300)
print(plot)

# RLRb_1 against RLRb_2
rows <- tab[c("Nvec_vc1.1_XM_048731783.1", "Nvec_vc1.1_XM_048731786.1"), ]
df <- as.data.frame(t(rows))
# Create scatter plot with linear regression line and R-squared value
plot <- ggscatter(
df,
x = "Nvec_vc1.1_XM_048731783.1",
y = "Nvec_vc1.1_XM_048731786.1",
add = "reg.line",
conf.int = TRUE,
cor.coef = TRUE,
cor.method = "pearson",
ggtheme = theme_pubr()
) +
stat_cor(aes(label = paste(..rr.label.., sep = "~`, `~")), label.x = 0.1, label.y = 2) +
labs(title = "", x = "RLRb_1 UMI fraction", y = "RLRb_2 UMI fraction") +
theme(text = element_text(size = 12))
print(plot)

# RLRb against GAPDH - as a sanity check
rows <- tab[c("Nvec_vc1.1_XM_048731783.1", "Nvec_vc1.1_XM_032382055.2"), ]
df <- as.data.frame(t(rows))
# Create scatter plot with linear regression line and R-squared value
plot <- ggscatter(
df,
x = "Nvec_vc1.1_XM_048731783.1",
y = "Nvec_vc1.1_XM_032382055.2",
add = "reg.line",
conf.int = TRUE,
cor.coef = TRUE,
cor.method = "pearson",
ggtheme = theme_pubr()
) +
stat_cor(aes(label = paste(..rr.label.., sep = "~`, `~")), label.x = 0.1, label.y = 2) +
labs(title = "", x = "RLRb_1 UMI fraction", y = "GAPDH UMI fraction") +
theme(text = element_text(size = 12))
print(plot)

# No correlation.
# Visualize the expression of mCherry per condition
gene_expression <- FetchData(seurat, vars = c("mCherry-plus-strand", "Condition"))
# Create a boxplot
ggplot(gene_expression,
aes(x = Condition, y = `mCherry-plus-strand`, fill = Condition)) +
geom_boxplot(outlier.shape = NA) + # Boxplot without outliers
geom_jitter(
width = 0.2,
alpha = 0.6,
color = "black",
size = 0.5
) + # Add jitter for individual cells
theme_minimal() +
labs(title = "Expression of GeneX across Conditions", x = "Condition", y = "Expression Level") +
scale_fill_brewer(palette = "Set2")

unique(gene_expression$Condition)
## [1] "iHCl" "Ctrl" "tPIC"
# Cleaner version
ggplot(gene_expression,
aes(x = Condition, y = `mCherry-plus-strand`, fill = Condition)) +
geom_boxplot(outlier.shape = NA) +
stat_compare_means(method = "wilcox.test",
label = "p.signif",
comparisons = list(c("iHCl", "Ctrl"), c("tPIC", "iHCl"), c("tPIC", "Ctrl"))) +
theme_minimal(base_size = 14) +
theme(
axis.text = element_text(size = 12, color = "black"),
axis.title = element_text(size = 14, face = "bold"),
legend.position = "none",
plot.title = element_text(size = 16, face = "bold", hjust = 0.5)
) +
labs(title = "GeneX Expression Across Conditions", x = "Condition", y = "Expression Level") +
scale_fill_manual(values = c("#1f78b4", "#33a02c", "#e31a1c"))

# Visualize co-expression of RLRb and mCherry
umap_data <- FetchData(seurat, vars = c("umap_1", "umap_2", "mCherry-plus-strand", "Nvec-vc1.1-XM-048731783.1"))
umap_data$`mCherry-plus-strand` <- scales::rescale(umap_data$`mCherry-plus-strand`, to = c(0, 1))
umap_data$`Nvec-vc1.1-XM-048731783.1` <- scales::rescale(umap_data$`Nvec-vc1.1-XM-048731783.1`, to = c(0, 1))
ggplot(umap_data, aes(x = umap_1, y = umap_2)) +
geom_point(aes(color = rgb(`mCherry-plus-strand`,`Nvec-vc1.1-XM-048731783.1`,0)), size = 1) +
scale_color_identity() +
theme_minimal() +
labs(title = "Co-expression of mCherry and RLRb", x = "UMAP 1", y = "UMAP 2")

# Determine cell type proportions
# Create a contingency table of clusters and conditions
cluster_condition_table <- table(seurat$seurat_clusters, seurat$Condition)
# Convert to proportions
cluster_condition_proportions <- prop.table(cluster_condition_table, margin = 2)
# Plot using ggplot2
library(ggplot2)
data <- as.data.frame(as.table(cluster_condition_proportions))
colnames(data) <- c("Cluster", "Condition", "Proportion")
ggplot(data, aes(x = Condition, y = Proportion, fill = Cluster)) +
geom_bar(stat = "identity", position = "fill") +
theme_minimal() +
labs(y = "Proportion", x = "Condition", fill = "Cluster") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Activated versus control molecular profile ------------------------------
seurat <- readRDS("~/immune_cells/scRNAseq_analysis/seurat_pipeline/seurat_clust.RDS")
# Subset cells
Idents(seurat) <- "Condition"
subset_seurat <- subset(seurat, idents = c("iHCl", "tPIC"))
# Use MAST for differential expression analysis between poly(I:C) ("tPIC") cells vsrsus
# NaCl cells ("iHCl).
#"MAST" : Identifies differentially expressed genes between two groups of cells using a hurdle model tailored to scRNA-seq data. Utilizes the MAST package to run the DE testing.
# This can take a few minutes to run
#de_results <- FindMarkers(
# seurat,
# ident.1 = "tPIC",
# ident.2 = "iHCl",
# test.use = "MAST"
#)
# Save the MAST resuls
#saveRDS(de_results, "~/immune_cells/scRNAseq_analysis/seurat_pipeline/MAST_pIC_NaCl.rds")
de_results<- readRDS("~/immune_cells/scRNAseq_analysis/seurat_pipeline/MAST_pIC_NaCl.rds")
# Obtain annotations and load results
gene_names_df <- readRDS("~/immune_cells/scRNAseq_analysis/annotaion/peptides_annotation.rds")
de_results$gene_name <- rownames(de_results)
de_results <- readRDS("~/immune_cells/scRNAseq_analysis/seurat_pipeline/MAST_pIC_NaCl.rds")
de_results$gene_name <- rownames(de_results)
# Merge the dataframes
merged_df <- merge(
de_results,
gene_names_df,
by.x = "gene_name",
by.y = "gene_name",
all.x = TRUE,
sort = FALSE
)
# Restore row names
rownames(merged_df) <- merged_df$gene_name
merged_df$gene_name <- NULL
# View the result
head(merged_df)
## p_val avg_log2FC
## Nvec-vc1.1-XM-001624274.3 0 4.031165
## Nvec-vc1.1-XM-001641466.3 0 4.311015
## Nvec-vc1.1-XM-001633857.3 0 6.198074
## Nvec-vc1.1-XM-001637235.3 0 2.528539
## Nvec-vc1.1-XM-048733148.1 0 3.828831
## Nvec-NVE814 0 3.691906
## pct.1 pct.2
## Nvec-vc1.1-XM-001624274.3 0.895 0.139
## Nvec-vc1.1-XM-001641466.3 0.847 0.098
## Nvec-vc1.1-XM-001633857.3 0.700 0.022
## Nvec-vc1.1-XM-001637235.3 0.850 0.197
## Nvec-vc1.1-XM-048733148.1 0.720 0.070
## Nvec-NVE814 0.909 0.265
## p_val_adj protein
## Nvec-vc1.1-XM-001624274.3 0
## Nvec-vc1.1-XM-001641466.3 0 ZCCHC24
## Nvec-vc1.1-XM-001633857.3 0
## Nvec-vc1.1-XM-001637235.3 0 IFI44
## Nvec-vc1.1-XM-048733148.1 0
## Nvec-NVE814 0
## domain
## Nvec-vc1.1-XM-001624274.3
## Nvec-vc1.1-XM-001641466.3 zf-3CxxC_2/
## Nvec-vc1.1-XM-001633857.3
## Nvec-vc1.1-XM-001637235.3 Septin/
## Nvec-vc1.1-XM-048733148.1
## Nvec-NVE814
# Visualize top genes using a heatmap
top_genes <- rownames(de_results[de_results$p_val_adj < 0.05 &
abs(de_results$avg_log2FC) > 2, ])
DoHeatmap(subset_seurat, features = top_genes) +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
## Warning in DoHeatmap(subset_seurat, features
## = top_genes): The following features were
## omitted as they were not found in the
## scale.data slot for the SCT assay:
## Nvec-vc1.1-XM-032378056.2,
## Nvec-vc1.1-XM-048732850.1,
## Nvec-vc1.1-XM-001624947.3,
## Nvec-vc1.1-XM-032370019.2,
## Nvec-vc1.1-XM-048719950.1,
## Nvec-vc1.1-XM-001641157.3,
## Nvec-vc1.1-XM-032366874.2,
## Nvec-vc1.1-XM-048721612.1,
## Nvec-vc1.1-XM-048720207.1,
## Nvec-vc1.1-XM-032375603.2, Nvec-v1g248480,
## Nvec-vc1.1-XM-032376441.2,
## Nvec-vc1.1-XM-048732024.1,
## Nvec-vc1.1-XM-032370497.2,
## Nvec-vc1.1-XM-048734095.1,
## Nvec-vc1.1-XM-048732901.1,
## Nvec-vc1.1-XM-048724448.1,
## Nvec-vc1.1-XM-001637953.3,
## Nvec-vc1.1-XM-001634990.3,
## Nvec-vc1.1-XM-032379398.2,
## Nvec-vc1.1-XM-048730483.1,
## Nvec-vc1.1-XM-032382277.2,
## Nvec-vc1.1-XM-048723374.1,
## Nvec-vc1.1-XM-048733294.1,
## Nvec-vc1.1-XM-048733785.1,
## Nvec-vc1.1-XM-048731819.1,
## Nvec-vc1.1-XM-032375574.2,
## Nvec-vc1.1-XM-001640215.3,
## Nvec-vc1.1-XM-032383310.2,
## Nvec-vc1.1-XM-032370334.2,
## Nvec-vc1.1-XM-048731845.1, Nvec-v1g239996,
## Nvec-vc1.1-XM-048725023.1,
## Nvec-vc1.1-XM-001623361.3,
## Nvec-vc1.1-XM-032377522.2,
## Nvec-vc1.1-XM-001629642.3,
## Nvec-vc1.1-XM-032379326.2,
## Nvec-vc1.1-XM-001634275.3,
## Nvec-vc1.1-XM-001637057.3,
## Nvec-vc1.1-XM-001634653.3,
## Nvec-vc1.1-XM-048720000.1,
## Nvec-vc1.1-XM-032375533.2,
## Nvec-vc1.1-XM-032366964.2,
## Nvec-vc1.1-XM-032384029.2,
## Nvec-vc1.1-XM-001622557.3,
## Nvec-vc1.1-XM-032376197.2,
## Nvec-vc1.1-XM-048724297.1,
## Nvec-vc1.1-XM-048731723.1,
## Nvec-vc1.1-XM-032367751.2,
## Nvec-vc1.1-XM-032373910.2,
## Nvec-vc1.1-XM-048720228.1,
## Nvec-vc1.1-XM-032371233.2,
## Nvec-vc1.1-XM-032363955.2,
## Nvec-vc1.1-XM-048725400.1, Nvec-NVE5267,
## Nvec-vc1.1-XM-048720243.1,
## Nvec-vc1.1-XM-001641093.3,
## Nvec-vc1.1-XM-032364864.2,
## Nvec-vc1.1-XM-032375542.2,
## Nvec-vc1.1-XM-032377357.1,
## Nvec-vc1.1-XM-048732880.1,
## Nvec-vc1.1-XM-032382196.2,
## Nvec-vc1.1-XM-048719942.1, Nvec-NVE4416,
## Nvec-vc1.1-XM-032387333.2,
## Nvec-vc1.1-XM-001630891.3,
## Nvec-vc1.1-XM-001636955.3,
## Nvec-vc1.1-XM-001635935.3,
## Nvec-vc1.1-XM-048724698.1,
## Nvec-vc1.1-XM-032386291.2,
## Nvec-vc1.1-XM-032371141.2,
## Nvec-vc1.1-XM-048732129.1,
## Nvec-vc1.1-XM-032371889.2,
## Nvec-vc1.1-XM-001622084.3,
## Nvec-vc1.1-XM-001624997.3,
## Nvec-vc1.1-XM-032383381.2,
## Nvec-vc1.1-XM-032370086.2,
## Nvec-vc1.1-XM-001629371.3, Nvec-NVE8786,
## Nvec-vc1.1-XM-032373982.2,
## Nvec-vc1.1-XM-048731263.1, Nvec-v1g198549,
## Nvec-NVE16861, Nvec-vc1.1-XM-032365042.2,
## Nvec-NVE6867, Nvec-vc1.1-XM-001624564.3,
## Nvec-vc1.1-XM-032377937.2,
## Nvec-vc1.1-XM-032371568.2,
## Nvec-vc1.1-XM-048722710.1,
## Nvec-vc1.1-XM-032385180.2,
## Nvec-vc1.1-XM-048725266.1,
## Nvec-vc1.1-XM-001629468.3,
## Nvec-vc1.1-XM-048722053.1,
## Nvec-vc1.1-XM-032361845.2,
## Nvec-vc1.1-XM-001629401.3,
## Nvec-vc1.1-XM-032379550.2,
## Nvec-vc1.1-XM-048733574.1, Nvec-v1g238666,
## Nvec-vc1.1-XM-032371512.2,
## Nvec-vc1.1-XM-048729958.1,
## Nvec-vc1.1-XM-001625476.3,
## Nvec-vc1.1-XM-032380165.2,
## Nvec-vc1.1-XM-048732153.1,
## Nvec-vc1.1-XM-048721536.1,
## Nvec-vc1.1-XM-048721923.1,
## Nvec-vc1.1-XM-032375432.2,
## Nvec-vc1.1-XM-032381215.2,
## Nvec-vc1.1-XM-032380820.2,
## Nvec-vc1.1-XM-048719359.1,
## Nvec-vc1.1-XM-032386576.2,
## Nvec-vc1.1-XM-048727579.1,
## Nvec-vc1.1-XM-032371443.2,
## Nvec-vc1.1-XM-048725640.1,
## Nvec-vc1.1-XM-001630717.3,
## Nvec-vc1.1-XM-032373008.2,
## Nvec-vc1.1-XM-048721807.1,
## Nvec-vc1.1-XM-032382427.2,
## Nvec-vc1.1-XM-048734392.1,
## Nvec-vc1.1-XM-032382117.2,
## Nvec-vc1.1-XM-032374183.2,
## Nvec-vc1.1-XM-048724139.1,
## Nvec-vc1.1-XM-048732603.1,
## Nvec-vc1.1-XM-048728763.1,
## Nvec-vc1.1-XM-032365231.2,
## Nvec-vc1.1-XM-048728010.1,
## Nvec-vc1.1-XM-032386292.2,
## Nvec-vc1.1-XM-001639717.3,
## Nvec-vc1.1-XM-032378472.2,
## Nvec-vc1.1-XM-048730254.1,
## Nvec-vc1.1-XM-032387348.2, Nvec-v1g244688,
## Nvec-vc1.1-XM-048720298.1,
## Nvec-vc1.1-XM-001627773.3,
## Nvec-vc1.1-XM-032371612.2, Nvec-v1g71242,
## Nvec-vc1.1-XM-001636093.3,
## Nvec-vc1.1-XM-001640804.3,
## Nvec-vc1.1-XM-032386416.2,
## Nvec-vc1.1-XM-032375704.2,
## Nvec-vc1.1-XM-032386262.2,
## Nvec-vc1.1-XM-048732025.1,
## Nvec-vc1.1-XM-048722141.1,
## Nvec-vc1.1-XM-032370184.2,
## Nvec-vc1.1-XM-048732499.1,
## Nvec-vc1.1-XM-048731669.1,
## Nvec-vc1.1-XM-032362841.2, Nvec-NVE17096,
## Nvec-vc1.1-XM-032376614.2,
## Nvec-vc1.1-XM-048725431.1,
## Nvec-vc1.1-XM-001621781.3,
## Nvec-vc1.1-XM-048729540.1, Nvec-NVE4470,
## Nvec-vc1.1-XM-032382934.2,
## Nvec-vc1.1-XM-048729646.1,
## Nvec-vc1.1-XM-048724213.1,
## Nvec-vc1.1-XM-001640257.3,
## Nvec-vc1.1-XM-032367138.2,
## Nvec-vc1.1-XM-048728490.1,
## Nvec-vc1.1-XM-032366225.2,
## Nvec-vc1.1-XM-048726085.1,
## Nvec-vc1.1-XM-048728039.1,
## Nvec-vc1.1-XM-001640389.3,
## Nvec-vc1.1-XM-032362439.2,
## Nvec-vc1.1-XM-032367008.2,
## Nvec-vc1.1-XM-048720937.1,
## Nvec-vc1.1-XM-032376553.2,
## Nvec-vc1.1-XM-001638897.3,
## Nvec-vc1.1-XM-048720727.1,
## Nvec-vc1.1-XM-048724505.1,
## Nvec-vc1.1-XM-032384878.2,
## Nvec-vc1.1-XM-048724737.1,
## Nvec-vc1.1-XM-032378236.2,
## Nvec-vc1.1-XM-032378366.2,
## Nvec-vc1.1-XM-032361875.2,
## Nvec-vc1.1-XM-032372440.2,
## Nvec-vc1.1-XM-032384966.2,
## Nvec-vc1.1-XM-001623817.3,
## Nvec-vc1.1-XM-032387110.2,
## Nvec-vc1.1-XM-032378771.2,
## Nvec-vc1.1-XM-048720426.1,
## Nvec-vc1.1-XM-048726678.1,
## Nvec-vc1.1-XM-032377385.2,
## Nvec-vc1.1-XM-048721756.1,
## Nvec-vc1.1-XM-032365561.2,
## Nvec-vc1.1-XM-032374044.2,
## Nvec-vc1.1-XM-048733389.1,
## Nvec-vc1.1-XM-001641694.3,
## Nvec-vc1.1-XM-001639126.3, Nvec-v1g240669,
## Nvec-vc1.1-XM-032365236.2,
## Nvec-vc1.1-XM-032384838.2,
## Nvec-vc1.1-XM-032385755.2,
## Nvec-vc1.1-XM-032384511.2,
## Nvec-vc1.1-XM-032381408.2,
## Nvec-vc1.1-XM-001635569.3,
## Nvec-vc1.1-XM-032377090.2,
## Nvec-vc1.1-XM-048732002.1,
## Nvec-vc1.1-XM-032378372.2,
## Nvec-vc1.1-XM-001634203.3,
## Nvec-vc1.1-XM-032386985.2,
## Nvec-vc1.1-XM-048726942.1,
## Nvec-vc1.1-XM-001641080.3,
## Nvec-vc1.1-XM-032370281.2,
## Nvec-vc1.1-XM-001628116.3,
## Nvec-vc1.1-XM-032367418.2,
## Nvec-vc1.1-XM-048730109.1,
## Nvec-vc1.1-XM-048733319.1,
## Nvec-vc1.1-XM-001638779.3,
## Nvec-vc1.1-XM-001627458.3,
## Nvec-vc1.1-XM-001640379.3,
## Nvec-vc1.1-XM-048720683.1,
## Nvec-vc1.1-XM-001630319.3,
## Nvec-vc1.1-XM-048734306.1,
## Nvec-vc1.1-XM-001639804.3,
## Nvec-vc1.1-XM-032376623.2,
## Nvec-vc1.1-XM-001638224.3,
## Nvec-vc1.1-XM-048721478.1,
## Nvec-vc1.1-XM-032383500.2,
## Nvec-vc1.1-XM-048734687.1,
## Nvec-vc1.1-XM-001631310.3,
## Nvec-vc1.1-XM-032367207.2,
## Nvec-vc1.1-XM-048719986.1,
## Nvec-vc1.1-XM-032365166.2,
## Nvec-vc1.1-XM-032382396.2,
## Nvec-vc1.1-XM-048733773.1,
## Nvec-vc1.1-XM-001638670.3,
## Nvec-vc1.1-XM-048721511.1,
## Nvec-vc1.1-XM-001625058.3,
## Nvec-vc1.1-XM-048732149.1,
## Nvec-vc1.1-XM-032362450.2,
## Nvec-vc1.1-XM-001632417.3,
## Nvec-vc1.1-XM-048724378.1,
## Nvec-vc1.1-XM-048733594.1,
## Nvec-vc1.1-XM-048726908.1,
## Nvec-vc1.1-XM-048721969.1,
## Nvec-vc1.1-XM-032366663.2,
## Nvec-vc1.1-XM-048727754.1,
## Nvec-vc1.1-XM-032376794.2,
## Nvec-vc1.1-XM-032369105.2,
## Nvec-vc1.1-XM-048722632.1,
## Nvec-vc1.1-XM-032382594.2,
## Nvec-vc1.1-XM-001623940.3,
## Nvec-vc1.1-XM-032366717.2,
## Nvec-vc1.1-XM-048722708.1,
## Nvec-vc1.1-XM-048727865.1,
## Nvec-vc1.1-XM-001628635.3,
## Nvec-vc1.1-XM-001629582.3,
## Nvec-vc1.1-XM-032381829.2,
## Nvec-vc1.1-XM-001640585.3,
## Nvec-vc1.1-XM-048726418.1,
## Nvec-vc1.1-XM-048723605.1,
## Nvec-vc1.1-XM-048722462.1,
## Nvec-vc1.1-XM-001641757.3,
## Nvec-vc1.1-XM-032369921.2, Nvec-v1g201852,
## Nvec-vc1.1-XM-032376742.2,
## Nvec-vc1.1-XM-001639153.3, Nvec-NVE124,
## Nvec-vc1.1-XM-032365212.2,
## Nvec-vc1.1-XM-001628985.3,
## Nvec-vc1.1-XM-032367365.2,
## Nvec-vc1.1-XM-032373809.2,
## Nvec-vc1.1-XM-001637262.3,
## Nvec-vc1.1-XM-032380887.2,
## Nvec-vc1.1-XM-032372092.2,
## Nvec-vc1.1-XM-001628645.3,
## Nvec-vc1.1-XM-032367467.2,
## Nvec-vc1.1-XM-032367697.2,
## Nvec-vc1.1-XM-048733400.1,
## Nvec-vc1.1-XM-032364244.2,
## Nvec-vc1.1-XM-001631285.3,
## Nvec-vc1.1-XM-001637408.3,
## Nvec-vc1.1-XM-048733066.1,
## Nvec-vc1.1-XM-032384571.2,
## Nvec-vc1.1-XM-048732210.1,
## Nvec-vc1.1-XM-048720903.1,
## Nvec-vc1.1-XM-001634705.3,
## Nvec-vc1.1-XM-032383554.2,
## Nvec-vc1.1-XM-032365155.2,
## Nvec-vc1.1-XM-048730525.1,
## Nvec-vc1.1-XM-048734399.1,
## Nvec-vc1.1-XM-048728423.1,
## Nvec-vc1.1-XM-048727388.1,
## Nvec-vc1.1-XM-032383541.2,
## Nvec-vc1.1-XM-032363582.2,
## Nvec-vc1.1-XM-048722760.1,
## Nvec-vc1.1-XM-032380628.2,
## Nvec-vc1.1-XM-032378393.2,
## Nvec-vc1.1-XM-032370897.2,
## Nvec-vc1.1-XM-048733829.1, Nvec-NVE16333,
## Nvec-vc1.1-XM-001630539.3,
## Nvec-vc1.1-XM-048726173.1,
## Nvec-vc1.1-XM-032365240.2,
## Nvec-vc1.1-XM-032381688.2,
## Nvec-vc1.1-XM-048731247.1, Nvec-NVE5061,
## Nvec-vc1.1-XM-048729213.1,
## Nvec-vc1.1-XM-032378649.2,
## Nvec-vc1.1-XM-001640934.3,
## Nvec-vc1.1-XM-001629620.3,
## Nvec-vc1.1-XM-032374118.2,
## Nvec-vc1.1-XM-048729133.1,
## Nvec-vc1.1-XM-001634592.3,
## Nvec-vc1.1-XM-032379957.2, Nvec-NVE13521,
## Nvec-vc1.1-XM-032366892.2, Nvec-NVE8123,
## Nvec-vc1.1-XM-0

# Volcano plot
df <- merged_df
library(tidyverse)
filtered_df <- df %>%
group_by(protein) %>%
filter(abs(avg_log2FC) == max(abs(avg_log2FC))) %>%
ungroup() %>% na.omit()
colors <- rep("#BFBFBF", nrow(filtered_df))
names(colors) <- rep("Negative", nrow(filtered_df))
colors[which(filtered_df$avg_log2FC >= 1 &
filtered_df$p_val_adj < 0.05)] <- "#E69F00"
names(colors) [which(filtered_df$avg_log2FC >= 1 &
filtered_df$p_val_adj < 0.05)] <- "UP"
colors[which(filtered_df$avg_log2FC <= -1 &
filtered_df$p_val_adj < 0.05)] <- "#990099"
names(colors) [which(filtered_df$avg_log2FC <= -1 &
filtered_df$p_val_adj < 0.05)] <- "DOWN"
library(EnhancedVolcano)
volcano_plot <- EnhancedVolcano(
filtered_df,
lab = filtered_df$protein,
x = "avg_log2FC",
y = "p_val_adj",
pCutoff = 0.05,
FCcutoff = 1,
colCustom = colors,
colAlpha = 0.3,
title = "",
subtitle = "",
caption = "",
drawConnectors = TRUE,
widthConnectors = 0.75,
ylim = c(NA, 100),
max.overlaps = 15,
labSize = 3.0
)
## Warning: One or more p-values is 0.
## Converting to 10^-1 * current lowest
## non-zero p-value...
# Save the plot as a PDF
ggsave(
filename = "~/immune_cells/scRNAseq_analysis/seurat_pipeline/figs_publication/MAST_volcano_corrected.pdf",
# File name
plot = volcano_plot,
# The plot object
device = "pdf",
# File format
width = 8,
height = 6,
# Dimensions in inches
dpi = 300 # Resolution (not relevant for PDFs but ensures sharpness)
)
## Warning: ggrepel: 1669 unlabeled data points
## (too many overlaps). Consider increasing
## max.overlaps

# Immune cluster characterization ----------------------------------------
# =============================================================================
# Cluster 1 markers, visualization, and enrichment analysis
# =============================================================================
library(Seurat) library(ggplot2) library(clusterProfiler) library(enrichplot)
seurat_rds \<- "\~/immune_cells/scRNAseq_analysis/seurat_pipeline/seurat_clust.RDS" go_dir \<- "\~/immune_cells/scRNAseq_analysis/annotaion/GOseq" out_dir \<- "\~/immune_cells/mCherry_RLRb_FACS/Differential_expression/figs_publication" dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
seurat_obj \<- readRDS(seurat_rds)
# =============================================================================
# 1) Differential expression: Cluster 1 markers
# =============================================================================
cluster_id \<- 1
cluster1_markers \<- FindMarkers( object = seurat_obj, ident.1 = cluster_id, min.pct = 0.25, logfc.threshold = 0.25 )
# Quick peek at top markers
head(cluster1_markers)
## p_val avg_log2FC
## Nvec-vc1.1-XM-001624274.3 0 5.619424
## Nvec-vc1.1-XM-001632975.3 0 4.609391
## Nvec-vc1.1-XM-001637235.3 0 4.544092
## Nvec-vc1.1-XM-032376004.2 0 3.592725
## Nvec-vc1.1-XM-032385537.2 0 6.242463
## Nvec-vc1.1-XM-048733148.1 0 5.136621
## pct.1 pct.2
## Nvec-vc1.1-XM-001624274.3 0.838 0.079
## Nvec-vc1.1-XM-001632975.3 0.917 0.186
## Nvec-vc1.1-XM-001637235.3 0.823 0.108
## Nvec-vc1.1-XM-032376004.2 0.897 0.200
## Nvec-vc1.1-XM-032385537.2 0.689 0.055
## Nvec-vc1.1-XM-048733148.1 0.654 0.036
## p_val_adj
## Nvec-vc1.1-XM-001624274.3 0
## Nvec-vc1.1-XM-001632975.3 0
## Nvec-vc1.1-XM-001637235.3 0
## Nvec-vc1.1-XM-032376004.2 0
## Nvec-vc1.1-XM-032385537.2 0
## Nvec-vc1.1-XM-048733148.1 0
# =============================================================================
# 2) Heatmap: top markers for Cluster 1
# =============================================================================
n_top_markers \<- 50 top_markers \<- rownames(cluster1_markers)[seq_len(min(n_top_markers, nrow(cluster1_markers)))]
heatmap_plot \<- DoHeatmap( object = seurat_obj, features = top_markers, group.by = "seurat_clusters", size = 4 ) + scale_fill_gradientn(colors = c("blue", "white", "red")) + ggtitle(sprintf("Top %d Markers for Cluster %s", length(top_markers), cluster_id)) + theme_minimal(base_size = 14) + theme( plot.title = element_text(hjust = 0.5, face = "bold", size = 16), legend.position = "right", legend.title = element_blank() )
## Scale for fill is already present.
## Adding another scale for fill, which will
## replace the existing scale.
print(heatmap_plot)

#ggsave(
# filename = file.path(out_dir, "cluster1_markers_heatmap.png"),
# plot = heatmap_plot,
# width = 8,
# height = 6,
# dpi = 300
#)
# =============================================================================
# 3) Expression visualization: selected immune genes across clusters
# =============================================================================
genes_of_interest <- c(
"mCherry-plus-strand",
"Nvec-vc1.1-XM-048731786.1",
"Nvec-vc1.1-XM-032361976.2",
"Nvec-vc1.1-XM-048730161.1"
)
vln_plot <- VlnPlot(
object = seurat_obj,
features = genes_of_interest,
pt.size = 0,
group.by = "seurat_clusters",
ncol = 1,
combine = TRUE
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(vln_plot)

# =============================================================================
# 5) Prepare ranked gene list (for GSEA-style inputs)
# =============================================================================
gene_list <- cluster1_markers$avg_log2FC
names(gene_list) <- gsub("-", "_", rownames(cluster1_markers))
gene_list <- na.omit(gene_list)
gene_list <- sort(gene_list, decreasing = TRUE)
# =============================================================================
# 7) Load GO annotation tables (TERM2GENE / TERM2NAME)
# =============================================================================
term2gene_path <- file.path(go_dir, "GoName_NewAnnotation.csv")
term2name_path <- file.path(go_dir, "Goterms_NewAnnotation.csv")
TermGene <- read.csv(term2gene_path, header = TRUE, check.names = FALSE)
TermName <- read.csv(term2name_path, header = TRUE, check.names = FALSE)
stopifnot(is.data.frame(TermGene), is.data.frame(TermName))
library(clusterProfiler)
library(enrichplot)
library(ggplot2)
# Filter significant upregulated genes
keep <- cluster1_markers$p_val_adj < 0.05 & cluster1_markers$avg_log2FC > 0.5
genes <- rownames(cluster1_markers[keep, , drop = FALSE])
genes <- gsub("-", "_", genes)
genes <- na.omit(genes)
# Load annotation tables
go_dir <- "~/immune_cells/scRNAseq_analysis/annotaion/GOseq"
TermGene <- read.csv(file.path(go_dir, "GoName_NewAnnotation.csv"), header = TRUE, check.names = FALSE)
TermName <- read.csv(file.path(go_dir, "Goterms_NewAnnotation.csv"), header = TRUE, check.names = FALSE)
# Run ORA
Results <- enricher(
gene = genes,
TERM2GENE = TermGene,
TERM2NAME = TermName,
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
qvalueCutoff = 0.2,
minGSSize = 10,
maxGSSize = 500
)
# Visualize
p = barplot(Results, showCategory = 15)
# Rotate the labels by 45 degrees
p + theme(axis.text.y = element_text(angle = 45, hjust = 1))

# Save table
write.table(
as.data.frame(Results),
file = "~/immune_cells/mCherry_RLRb_FACS/Differential_expression/figs_publication/mCherry_upregulated_ORA.txt",
sep = "\t",
quote = FALSE,
row.names = FALSE
)
# Finding activation and basal condition specific markers within the immune cluster
# ==========================================================
# Cluster 1 "immune" DE (tPIC vs Ctrl) + GO-based labeling
# Assign each DE gene to TF / Receptor / Effector / Other
# - Produce: (1) DotPlot of top genes per category, (2) compareCluster ORA
# ==========================================================
suppressPackageStartupMessages({
library(Seurat)
library(dplyr)
library(tidyr)
library(ggplot2)
library(clusterProfiler)
})
# Inputs
seurat_obj <- readRDS("~/immune_cells/scRNAseq_analysis/seurat_pipeline/seurat_clust.RDS")
gene_names_df <- readRDS("~/immune_cells/scRNAseq_analysis/annotaion/peptides_annotation.rds")
TermGene <- read.csv(
file = "~/immune_cells/scRNAseq_analysis/annotaion/GOseq/GoName_NewAnnotation.csv",
header = TRUE,
check.names = FALSE
)
TermName <- read.csv(
file = "~/immune_cells/scRNAseq_analysis/annotaion/GOseq/Goterms_NewAnnotation.csv",
header = TRUE,
check.names = FALSE
)
# Parameters I am interested in
cluster_id <- "1"
cond1 <- "tPIC"
cond2 <- "Ctrl"
# DE settings (FindMarkers)
logfc_thresh <- 0.25
min_pct <- 0.10
# What you call "strong" DE for downstream summaries
de_fc_cutoff <- 0.5
padj_cutoff <- 0.05
# How many representatives to show per category
top_n <- 3
# Outputs (same as your original intent)
out_tpic_csv <- "~/immune_cells/figures_revised/tpic_up_df.csv"
out_ctrl_csv <- "~/immune_cells/figures_revised/ctrl_up_df.csv"
out_portrait <- "~/immune_cells/figures_revised/molecular_portrait.csv"
# Helper: pick top N genes per category (by absolute effect size)
get_top_genes <- function(df, direction, n = 3) {
df %>%
group_by(category) %>%
slice_max(order_by = abs(avg_log2FC), n = n, with_ties = FALSE) %>%
ungroup() %>%
mutate(condition = direction)
}
# ==========================================================
# 1) Subset the immune cluster and run DE between conditions
# ==========================================================
immune_cells <- subset(seurat_obj, idents = cluster_id)
# Use Condition as the identity class (so FindMarkers compares conditions)
Idents(immune_cells) <- immune_cells$Condition
de_genes <- FindMarkers(
immune_cells,
ident.1 = cond1,
ident.2 = cond2,
logfc.threshold = logfc_thresh,
min.pct = min_pct
)
# ==========================================================
# 2) Build a compact "gene -> GO term names" lookup table
# (so each gene gets a single text field we can keyword-scan)
# ==========================================================
gene_go <- TermGene %>%
inner_join(TermName, by = "Go_Term") %>%
transmute(
gene = Ids, # TERM2GENE IDs (underscore format)
GO_name = Name
) %>%
group_by(gene) %>%
summarise(
GO_combined = paste(unique(GO_name), collapse = "; "),
.groups = "drop"
)
# ==========================================================
# 3) Attach GO text + peptide annotations to the DE table
# Then label each gene into coarse functional buckets.
#
# ==========================================================
de_annot <- de_genes %>%
tibble::rownames_to_column("gene_raw") %>%
mutate(
# Normalize for GO join: GO uses underscores
gene = gsub("-", "_", gene_raw)) %>%
left_join(gene_go, by = "gene") %>%
mutate(
category = dplyr::case_when(
grepl("transcription factor|DNA-binding|regulation of transcription",
GO_combined, ignore.case = TRUE) ~ "TF",
grepl("receptor activity|signaling receptor|G protein-coupled receptor|membrane",
GO_combined, ignore.case = TRUE) ~ "Receptor",
grepl("immune|defense|cytokine|interferon|effector",
GO_combined, ignore.case = TRUE) ~ "Effector",
TRUE ~ "Other"
)
) %>%
left_join(gene_names_df, by = c("gene_raw" = "gene_name"))
message("Category counts:")
## Category counts:
print(table(de_annot$category, useNA = "ifany"))
##
## Effector Other Receptor TF
## 54 3638 396 235
message("Peptide annotation coverage:")
## Peptide annotation coverage:
message(" annotated rows: ", sum(!is.na(de_annot$protein) | !is.na(de_annot$domain), na.rm = TRUE))
## annotated rows: 4323
message(" total rows: ", nrow(de_annot))
## total rows: 4323
# ==========================================================
# 4) Define the "up" gene sets
# (strong log2foldchange + significant adjusted p-value)
# ==========================================================
tpic_up <- de_annot %>%
filter(avg_log2FC > de_fc_cutoff, p_val_adj < padj_cutoff)
ctrl_up <- de_annot %>%
filter(avg_log2FC < -de_fc_cutoff, p_val_adj < padj_cutoff)
# Save tables
#write.csv(tpic_up, out_tpic_csv, row.names = FALSE)
#write.csv(ctrl_up, out_ctrl_csv, row.names = FALSE)
# ==========================================================
# 5) Quick visual sanity check: DotPlot of representative genes
# We show the strongest hits per category for each direction.
# ==========================================================
top_genes <- bind_rows(
get_top_genes(tpic_up, cond1, n = top_n),
get_top_genes(ctrl_up, cond2, n = top_n)
) %>%
arrange(condition, category)
# DotPlot needs Seurat's feature names -> use gene_raw
genes_to_plot <- unique(top_genes$gene_raw)
p_dot <- DotPlot(immune_cells, features = genes_to_plot, group.by = "Condition") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = paste0("Cluster ", cluster_id, ": top DE genes by GO-category (", cond1, " vs ", cond2, ")"))
## Warning: Scaling data with a low number of
## groups may produce misleading results
print(p_dot)

# ==========================================================
# 6) ORA: compare GO enrichment between the two "up" sets
# Here I use underscore-normalized IDs to match TERM2GENE.
# ==========================================================
gene_lists <- list(
tPIC_up = unique(tpic_up$gene),
Ctrl_up = unique(ctrl_up$gene)
)
cc_result <- compareCluster(
geneCluster = gene_lists,
fun = "enricher",
TERM2GENE = TermGene,
TERM2NAME = TermName,
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
minGSSize = 10,
maxGSSize = 500,
qvalueCutoff = 0.2
)
p_cc <- dotplot(cc_result, showCategory = 10, font.size = 12) +
ggtitle("GO term enrichment comparison: tPIC-up vs Ctrl-up")
print(p_cc)

# ==========================================================
# 7) Save the full annotated DE table ("molecular portrait")
# ==========================================================
write.csv(de_annot, out_portrait, row.names = FALSE)
message(
"Finished.\n",
"Wrote:\n",
" - ", out_tpic_csv, "\n",
" - ", out_ctrl_csv, "\n",
" - ", out_portrait
)
## Finished.
## Wrote:
## - ~/immune_cells/figures_revised/tpic_up_df.csv
## - ~/immune_cells/figures_revised/ctrl_up_df.csv
## - ~/immune_cells/figures_revised/molecular_portrait.csv
# Comparison with the bulk-RNA-seq results --------------------------------
# Compute module score for the mCherry positive cells and visualize the bulk-RNA-seq.
# Where the mCherry+ genes are expressed?
seurat<- readRDS("~/immune_cells/scRNAseq_analysis/seurat_pipeline/seurat_clust.RDS")
# Load the mCherry+ upregulated genes
gene_set <- read.delim(
"~/immune_cells/mCherry_RLRb_FACS/Differential_expression/mCherry_upregulated.txt"
) # Replace the the names from underscore to dash
gene_set <- rownames(gene_set)
gene_set <- gsub("_", "-", gene_set)
# Check if genes exist in the dataset
gene_set <- intersect(gene_set, rownames(seurat_obj))
# Add module score for the gene set
seurat_obj <- AddModuleScore(seurat, features = list(gene_set), name = "ProcessScore")
# Visualize the module score on UMAP
aggregated_score <- FeaturePlot_scCustom(
seurat_obj,
features = "ProcessScore1",
reduction = "umap",
pt.size = 0.5,
colors_use = viridis_inferno_dark_high
) + ggtitle("") + theme(axis.title.x = element_blank(), axis.title.y = element_blank())
## Warning: Some of the plotted features are from
## meta.data slot.
## • Please check that `na_cutoff` param is
## being set appropriately for those
## features.
# Display the plot
print(aggregated_score)

# Save the plot as high-resolution output
ggsave(
"aggregated_mCherry_score_umap.png",
plot = aggregated_score,
width = 8,
height = 6,
dpi = 300
)
# Identification of marker genes and annotation ---------------------------
# Cell type markers S6
# All marker genes
cellType_markers <- read.csv(
"~/immune_cells/scRNAseq_analysis/seurat_pipeline/figs_publication/supplementary/cellType_markers3.csv"
)
# Read the clustered Seurat object
#seurat<- readRDS("~/immune_cells/scRNAseq_analysis/seurat_pipeline/seurat_clust.RDS")
# Add immune genes
# Replace underscores with dashes in the entire data frame
df_markers <- data.frame(lapply(cellType_markers, function(x) {
if (is.character(x) || is.factor(x)) {
gsub("_", "-", as.character(x)) # Replace in character or factor columns
} else {
x # Leave other columns unchanged
}
}), stringsAsFactors = FALSE)
df_markers <- df_markers[1:17, ]
# Plotting
library(patchwork)
visualize_markers <- function(seurat, df_markers) {
plots <- lapply(seq_len(nrow(df_markers)), function(i) {
gene <- df_markers$DToL[i]
model <- df_markers$ID[i]
name <- df_markers$Marker[i]
# Check if the gene exists in the Seurat object
if (gene %in% rownames(seurat)) {
FeaturePlot_scCustom(seurat, features = gene) +
ggtitle(paste(name, ":", model)) +
theme(plot.title = element_text(size = 8)) +
theme(axis.title = element_blank())
} else {
message(paste("Skipping", gene, "as it is not found in the Seurat object"))
NULL
}
})
# Remove NULL entries from the plot list
plots <- Filter(Negate(is.null), plots)
# Combine plots if any are available
if (length(plots) > 0) {
wrap_plots(plots, ncol = 4)
} else {
message("No valid markers found for visualization.")
}
}
visualize_markers(seurat = seurat, df_markers = df_markers)
## Skipping Nvec-vc1.1-XM-001628041.3 as it is not found in the Seurat object

S6 <- visualize_markers(seurat = seurat, df_markers = df_markers)
## Skipping Nvec-vc1.1-XM-001628041.3 as it is not found in the Seurat object
ggsave(
"S6_marker_genes.png",
plot = S6,
width = 12,
height = 9,
dpi = 300
)
# Annotate cell types and re-print figure
Idents(seurat) <- "seurat_clusters"
table(Idents(seurat))
##
## 0 1 2 3 4 5 6
## 5103 3225 950 651 524 410 202
# Convert to characters
Idents(seurat) <- factor(as.character(seurat$seurat_clusters))
levels(Idents(seurat))
## [1] "0" "1" "2" "3" "4" "5" "6"
cell_types <- c(
"0" = "Ectoderm",
"1" = "Immune",
"2" = "Ect.oral",
"3" = "Nemtocye",
"4" = "Ect.aboral",
"5" = "Gland",
"6" = "Neuronal"
)
cell_types_num <- setNames(paste0(names(cell_types), ": ", cell_types), names(cell_types))
seurat <- RenameIdents(seurat, cell_types_num)
table(Idents(seurat))
##
## 0: Ectoderm 1: Immune 2: Ect.oral
## 5103 3225 950
## 3: Nemtocye 4: Ect.aboral 5: Gland
## 651 524 410
## 6: Neuronal
## 202
levels(Idents(seurat))
## [1] "0: Ectoderm" "1: Immune"
## [3] "2: Ect.oral" "3: Nemtocye"
## [5] "4: Ect.aboral" "5: Gland"
## [7] "6: Neuronal"
DimPlot(seurat)

# Save as RDS file (available on Zenodo)
#saveRDS(
# seurat,
# "~/immune_cells/scRNAseq_analysis/seurat_pipeline/seurat_obj2_annotated.rds"
#)
# Visualizing the WGCNA metacell results ---------------------------------
library(Seurat)
library(dplyr)
seurat<- readRDS("~/immune_cells/scRNAseq_analysis/seurat_pipeline/seurat_clust.RDS")
modules<- read.delim("/sci/labs/yehum79/itamar273/scRNAseq Arnau/to_yehu/gene_modules/WGCNA_gmod_annotation.txt")
# seurat: Clustered Seurat object
# modules: WGCNA data.frame with columns gene_id, gene_module, membership_score, ...
# Make sure clustering identities are the active identities
Idents(seurat) <- "seurat_clusters"
# Apply the same order as the WGCNA compact heatmap (Supplementary Fig. 7a)
gs_order <- c(
"turquoise", "pink", "grey60", "salmon", "cyan",
"grey", "green", "yellow", "tan", "greenyellow",
"blue", "magenta", "black", "red", "purple",
"lightcyan", "brown", "lightgreen", "midnightblue"
)
GS_names <- paste0("GS", seq_along(gs_order))
modules$gene_id<- gsub("_","-", modules$gene_id)
library(dplyr)
gene_sets_df <- modules %>%
filter(gene_module %in% gs_order) %>%
filter(gene_id %in% rownames(seurat)) %>%
distinct(gene_module, gene_id) %>%
group_by(gene_module) %>%
summarise(genes = list(gene_id), .groups = "drop")
gene_sets <- setNames(gene_sets_df$genes, gene_sets_df$gene_module)
gene_sets <- gene_sets[gs_order]
stopifnot(
length(gene_sets) == 19,
all(names(gene_sets) == gs_order)
)
seurat <- AddModuleScore(
object = seurat,
features = gene_sets,
name = "GS_",
assay = DefaultAssay(seurat),
slot = "data"
)
meta <- seurat[[]]
old <- paste0("GS_", seq_along(gs_order))
new <- paste0("GS", seq_along(gs_order))
colnames(meta)[match(old, colnames(meta))] <- new
seurat[[]] <- meta
all(new %in% colnames(seurat[[]])) # TRUE
## [1] TRUE
vln_plot <- VlnPlot(
seurat,
features = new,
group.by = "seurat_clusters",
pt.size = 0.1,
ncol = 7,
alpha = 0.3,
cols = rep("grey",7),
)
print(vln_plot)

# Conduct over-representation analysis on individual gene sets
library(dplyr)
library(clusterProfiler)
setwd("~/immune_cells/scRNAseq_analysis/annotaion/GOseq/")
TermGene <- read.csv("GoName_NewAnnotation.csv", header = TRUE, check.names = FALSE)
TermName <- read.csv("Goterms_NewAnnotation.csv", header = TRUE, check.names = FALSE)
out_dir <- "~/immune_cells/Supp_tables/ORA_GS14_GS17/"
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
stopifnot(is.data.frame(TermGene), is.data.frame(TermName))
gs_to_module <- c(
GS14 = "red",
GS15 = "purple",
GS16 = "lightcyan",
GS17 = "brown"
)
modules$gene_id<- gsub("-","_", modules$gene_id)
ora_results <- list()
ora_plots <- list()
for (gs in names(gs_to_module)) {
mod <- gs_to_module[[gs]]
genes <- modules %>%
filter(gene_module == mod) %>%
pull(gene_id) %>%
unique()
res <- enricher(
gene = genes,
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
qvalueCutoff = 0.2,
minGSSize = 10,
maxGSSize = 500,
TERM2GENE = TermGene,
TERM2NAME = TermName
)
ora_results[[gs]] <- res
# only make a plot if enrichment exists
if (!is.null(res) && nrow(as.data.frame(res)) > 0) {
ora_plots[[gs]] <- barplot(
res,
showCategory = 15,
title = paste0(gs, " (", mod, ")")
)
} else {
ora_plots[[gs]] <- NULL
}
}
print(ora_plots$GS14)

print(ora_plots$GS15)

print(ora_plots$GS16)

print(ora_plots$GS17)

---
## Session information
```text
R version 4.4.1 (2024-06-14)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
locale:
[1] LC_CTYPE=en_US.UTF-8
[2] LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8
[8] LC_NAME=C
[9] LC_ADDRESS=C
[10] LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8
[12] LC_IDENTIFICATION=C
time zone: Etc/UTC
tzcode source: system (glibc)
attached base packages:
[1] stats4 grid
[3] stats graphics
[5] grDevices utils
[7] datasets methods
[9] base
other attached packages:
[1] patchwork_1.3.0
[2] enrichplot_1.24.4
[3] EnhancedVolcano_1.22.0
[4] ggrepel_0.9.6
[5] lubridate_1.9.3
[6] forcats_1.0.0
[7] stringr_1.5.1
[8] purrr_1.0.2
[9] readr_2.1.5
[10] tidyr_1.3.1
[11] tibble_3.2.1
[12] tidyverse_2.0.0
[13] scCustomize_3.0.1
[14] MAST_1.30.0
[15] SingleCellExperiment_1.26.0
[16] SummarizedExperiment_1.34.0
[17] Biobase_2.64.0
[18] GenomicRanges_1.56.2
[19] GenomeInfoDb_1.40.1
[20] IRanges_2.38.1
[21] S4Vectors_0.42.1
[22] BiocGenerics_0.50.0
[23] MatrixGenerics_1.16.0
[24] matrixStats_1.4.1
[25] ggpubr_0.6.0
[26] ggplot2_3.5.1
[27] Matrix_1.7-1
[28] clusterProfiler_4.12.6
[29] dplyr_1.1.4
[30] Seurat_5.1.0
[31] SeuratObject_5.0.2
[32] sp_2.1-4
loaded via a namespace (and not attached):
[1] R.methodsS3_1.8.2
[2] goftest_1.2-3
[3] Biostrings_2.72.1
[4] vctrs_0.6.5
[5] spatstat.random_3.3-2
[6] digest_0.6.37
[7] png_0.1-8
[8] shape_1.4.6.1
[9] deldir_2.0-4
[10] parallelly_1.38.0
[11] MASS_7.3-61
[12] reshape2_1.4.4
[13] httpuv_1.6.15
[14] foreach_1.5.2
[15] qvalue_2.36.0
[16] withr_3.0.1
[17] ggrastr_1.0.2
[18] xfun_0.48
[19] ggfun_0.1.7
[20] survival_3.7-0
[21] memoise_2.0.1
[22] ggbeeswarm_0.7.2
[23] janitor_2.2.1
[24] gson_0.1.0
[25] systemfonts_1.1.0
[26] tidytree_0.4.6
[27] zoo_1.8-15
[28] GlobalOptions_0.1.2
[29] pbapply_1.7-2
[30] R.oo_1.26.0
[31] Formula_1.2-5
[32] rematch2_2.1.2
[33] KEGGREST_1.44.1
[34] promises_1.3.0
[35] httr_1.4.7
[36] rstatix_0.7.2
[37] globals_0.16.3
[38] fitdistrplus_1.2-1
[39] rstudioapi_0.16.0
[40] UCSC.utils_1.0.0
[41] miniUI_0.1.1.1
[42] generics_0.1.3
[43] DOSE_3.30.5
[44] zlibbioc_1.50.0
[45] ggraph_2.2.1
[46] polyclip_1.10-7
[47] GenomeInfoDbData_1.2.12
[48] SparseArray_1.4.8
[49] xtable_1.8-4
[50] doParallel_1.0.17
[51] evaluate_1.0.1
[52] S4Arrays_1.4.1
[53] hms_1.1.3
[54] irlba_2.3.5.1
[55] colorspace_2.1-1
[56] ROCR_1.0-11
[57] reticulate_1.39.0
[58] spatstat.data_3.1-2
[59] magrittr_2.0.3
[60] lmtest_0.9-40
[61] snakecase_0.11.1
[62] later_1.3.2
[63] viridis_0.6.5
[64] ggtree_3.12.0
[65] lattice_0.22-6
[66] spatstat.geom_3.3-3
[67] future.apply_1.11.2
[68] scattermore_1.2
[69] shadowtext_0.1.4
[70] cowplot_1.1.3
[71] RcppAnnoy_0.0.22
[72] pillar_1.9.0
[73] nlme_3.1-166
[74] iterators_1.0.14
[75] compiler_4.4.1
[76] RSpectra_0.16-2
[77] stringi_1.8.4
[78] tensor_1.5
[79] plyr_1.8.9
[80] crayon_1.5.3
[81] abind_1.4-8
[82] gridGraphics_0.5-1
[83] locfit_1.5-9.10
[84] graphlayouts_1.2.0
[85] bit_4.5.0
[86] fastmatch_1.1-4
[87] codetools_0.2-20
[88] textshaping_0.4.0
[89] bslib_0.8.0
[90] paletteer_1.6.0
[91] GetoptLong_1.0.5
[92] plotly_4.10.4
[93] mime_0.12
[94] splines_4.4.1
[95] circlize_0.4.16
[96] Rcpp_1.0.14
[97] fastDummies_1.7.4
[98] knitr_1.48
[99] blob_1.2.4
[100] utf8_1.2.4
[101] clue_0.3-65
[102] fs_1.6.4
[103] listenv_0.9.1
[104] ggsignif_0.6.4
[105] ggplotify_0.1.2
[106] statmod_1.5.0
[107] tzdb_0.4.0
[108] tweenr_2.0.3
[109] pkgconfig_2.0.3
[110] tools_4.4.1
[111] cachem_1.1.0
[112] RSQLite_2.3.7
[113] viridisLite_0.4.2
[114] DBI_1.2.3
[115] fastmap_1.2.0
[116] rmarkdown_2.28
[117] scales_1.3.0
[118] ica_1.0-3
[119] broom_1.0.7
[120] sass_0.4.9
[121] ggprism_1.0.5
[122] dotCall64_1.2
[123] carData_3.0-5
[124] RANN_2.6.2
[125] farver_2.1.2
[126] tidygraph_1.3.1
[127] scatterpie_0.2.4
[128] mgcv_1.9-1
[129] yaml_2.3.10
[130] cli_3.6.3
[131] leiden_0.4.3.1
[132] lifecycle_1.0.4
[133] uwot_0.2.2
[134] presto_1.0.0
[135] backports_1.5.0
[136] BiocParallel_1.38.0
[137] timechange_0.3.0
[138] gtable_0.3.5
[139] rjson_0.2.23
[140] ggridges_0.5.6
[141] progressr_0.14.0
[142] parallel_4.4.1
[143] ape_5.8
[144] limma_3.60.6
[145] jsonlite_1.8.9
[146] RcppHNSW_0.6.0
[147] bit64_4.5.2
[148] Rtsne_0.17
[149] yulab.utils_0.1.7
[150] spatstat.utils_3.1-0
[151] jquerylib_0.1.4
[152] highr_0.11
[153] GOSemSim_2.30.2
[154] spatstat.univar_3.0-1
[155] R.utils_2.12.3
[156] lazyeval_0.2.2
[157] shiny_1.9.1
[158] htmltools_0.5.8.1
[159] GO.db_3.19.1
[160] sctransform_0.4.1
[161] rappdirs_0.3.3
[162] glue_1.8.0
[163] ggvenn_0.1.10
[164] spam_2.11-0
[165] httr2_1.0.5
[166] XVector_0.44.0
[167] treeio_1.28.0
[168] gridExtra_2.3
[169] igraph_2.0.3
[170] R6_2.5.1
[171] DESeq2_1.44.0
[172] labeling_0.4.3
[173] cluster_2.1.6
[174] pkgload_1.4.0
[175] aplot_0.2.3
[176] DelayedArray_0.30.1
[177] tidyselect_1.2.1
[178] vipor_0.4.7
[179] ggforce_0.4.2
[180] car_3.1-3
[181] AnnotationDbi_1.66.0
[182] future_1.34.0
[183] munsell_0.5.1
[184] KernSmooth_2.23-24
[185] data.table_1.16.2
[186] htmlwidgets_1.6.4
[187] fgsea_1.30.0
[188] ComplexHeatmap_2.20.0
[189] RColorBrewer_1.1-3
[190] rlang_1.1.4
[191] spatstat.sparse_3.1-0
[192] spatstat.explore_3.3-2
[193] fansi_1.0.6
[194] beeswarm_0.4.0